library(tidyverse)
library(ggplot2)
library(dplyr)
library(readr)
library(ggthemes)
library(caret)
library(randomForest)
library(pROC)
library(car)
library(broom)
library(reshape2)
library(plotly)
library(patchwork)
library(tidyr)
library(corrplot)
library(glmnet)
library(randomForest)
raw_df <- read_csv("../dataset/FoodAccessResearchAtlasData2019.csv")
str((raw_df))
df_fixed <- raw_df %>%
mutate(across(where(is.character), ~ na_if(., "NULL")))
missing_pct <- colMeans(is.na(df_fixed))
df_clean <- df_fixed[, missing_pct <= 0.10]
str(df_clean)
According to the USDA definitions, a “food desert” is typically a low-income tract that also has low access to supermarkets based on established distance criteria. Here, we use the ‘LILATracts_1And10’ column (which applies a 1-mile threshold for urban and a 10-mile threshold for rural areas) as an indicator. We assume that a value of 1 in ‘LILATracts_1And10’ indicates that the tract qualifies as a food desert.
df$FoodDesert <- as.numeric(as.character(df$LILATracts_1And10))
state_urban_counts <- df %>%
group_by(State, Urban) %>%
summarise(FoodDesert = sum(FoodDesert, na.rm = TRUE), .groups = 'drop')
food_desert_pivot <- state_urban_counts %>%
pivot_wider(names_from = Urban, values_from = FoodDesert, values_fill = 0) %>%
rename(`Rural Food Desert Count` = `0`, `Urban Food Desert Count` = `1`) %>%
mutate(`Total Food Desert Count` = `Rural Food Desert Count` + `Urban Food Desert Count`) %>%
arrange(desc(`Total Food Desert Count`))
head(food_desert_pivot, 10)
state_totals <- df %>%
group_by(State) %>%
summarise(
TotalFoodDeserts = sum(FoodDesert, na.rm = TRUE)
) %>%
arrange(desc(TotalFoodDeserts))
state_totals_top10 <- head(state_totals, 10)
ggplot(state_totals_top10, aes(x = reorder(State, TotalFoodDeserts), y = TotalFoodDeserts)) +
geom_bar(stat = "identity", fill = "coral") +
coord_flip() +
labs(
title = "Top 10 States by Number of Food Deserts",
x = "State",
y = "Number of Food Deserts"
) +
theme_minimal(base_size = 14)
state_summary <- df %>%
group_by(State, Urban) %>%
summarise(
TotalFoodDeserts = sum(FoodDesert, na.rm = TRUE),
TotalTracts = n(),
FoodDesertPercentage = 100 * TotalFoodDeserts / TotalTracts,
.groups = 'drop'
)
urban_summary <- state_summary %>%
filter(Urban == 1) %>%
arrange(desc(FoodDesertPercentage))
rural_summary <- state_summary %>%
filter(Urban == 0) %>%
arrange(desc(FoodDesertPercentage))
urban_plot <- ggplot(urban_summary, aes(x = reorder(State, FoodDesertPercentage), y = FoodDesertPercentage)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Urban Food Desert % by State",
x = "State",
y = "Urban Desert %"
) +
theme_minimal(base_size = 12)
rural_plot <- ggplot(rural_summary, aes(x = reorder(State, FoodDesertPercentage), y = FoodDesertPercentage)) +
geom_bar(stat = "identity", fill = "seagreen") +
coord_flip() +
labs(
title = "Rural Food Desert % by State",
x = "State",
y = "Rural Desert %"
) +
theme_minimal(base_size = 12)
urban_plot + rural_plot + plot_layout(ncol = 2)
By far Texas has the most food desert tracts, but it also has a large number of total tracts. Mississippi has the highest percentage of food desert tracts, with an astonishing 32% of its tracts being food deserts. Mississppi also has the highest percentage of food desert tracts in urban areas which contribute to 42% of its total tracts. Arizona and alaska share the highest percentage of food desert tracts in rural areas, with 29%.
foodatlas <- read_csv("../dataset/FoodAccessResearchAtlasData2019.csv", col_types = cols(CensusTract = col_character()))
socioeconomic <- read_csv("../dataset/FE_socioeconomic.csv")
insecurity <- read_csv("../dataset/FE_insecurity.csv")
health <- read_csv("../dataset/FE_health.csv")
stores <- read_csv("../dataset/FE_stores.csv")
restaurants <- read_csv("../dataset/FE_restaurants.csv")
taxes <- read_csv("../dataset/FE_taxes.csv")
local <- read_csv("../dataset/FE_local.csv")
access <- read_csv("../dataset/FE_access.csv")
state_data <- read_csv("../dataset/FE_supplemental_data_state.csv")
county_data <- read_csv("../dataset/FE_supplemental_data_county.csv")
clean_nulls <- function(df) {
df %>% mutate(across(where(is.character), ~ na_if(., "NULL")))
}
foodatlas <- clean_nulls(foodatlas)
socioeconomic <- clean_nulls(socioeconomic)
insecurity <- clean_nulls(insecurity)
health <- clean_nulls(health)
stores <- clean_nulls(stores)
restaurants <- clean_nulls(restaurants)
taxes <- clean_nulls(taxes)
local <- clean_nulls(local)
access <- clean_nulls(access)
state_data <- clean_nulls(state_data)
county_data <- clean_nulls(county_data)
foodatlas <- foodatlas %>%
mutate(CensusTract = str_pad(CensusTract, 11, pad = "0"),
CountyFIPS = substr(CensusTract, 1, 5))
socioeconomic <- socioeconomic %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0"))
insecurity <- insecurity %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0"))
health <- health %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0"))
stores <- stores %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0"))
restaurants <- restaurants %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0"))
taxes <- taxes %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0"))
local <- local %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0"))
access <- access %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0"))
county_data <- county_data %>% mutate(CountyFIPS = str_pad(FIPS, 5, pad = "0"))
# Merge everything into foodatlas
merged_df <- foodatlas %>%
left_join(socioeconomic, by = "CountyFIPS") %>%
left_join(insecurity, by = "CountyFIPS") %>%
left_join(health, by = "CountyFIPS") %>%
left_join(stores, by = "CountyFIPS") %>%
left_join(restaurants, by = "CountyFIPS") %>%
left_join(taxes, by = "CountyFIPS") %>%
left_join(local, by = "CountyFIPS") %>%
left_join(access, by = "CountyFIPS") %>%
left_join(county_data, by = "CountyFIPS")
merged_df$FoodDesert <- as.numeric(as.character(merged_df$LILATracts_1And10))
predictor_vars <- merged_df %>%
select(-FoodDesert) %>%
select(where(is.numeric))
# Drop columns with >10% missing
missing_pct <- colMeans(is.na(predictor_vars))
predictor_vars <- predictor_vars[, missing_pct <= 0.10]
# Impute remaining NA with medians
predictor_vars <- predictor_vars %>%
mutate(across(everything(), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
is_binary <- function(x) {
unique_vals <- unique(na.omit(x))
length(unique_vals) == 2 && all(sort(unique_vals) %in% c(0, 1))
}
binary_cols <- names(predictor_vars)[sapply(predictor_vars, is_binary)]
continuous_cols <- setdiff(names(predictor_vars), binary_cols)
# Convert binary variables to factor
predictor_vars <- predictor_vars %>%
mutate(across(all_of(binary_cols), as.factor))
# Scale continuous variables
predictor_vars <- predictor_vars %>%
mutate(across(all_of(continuous_cols), scale))
# Prepare modeling data
model_data <- bind_cols(
FoodDesert = merged_df$FoodDesert,
predictor_vars
) %>% drop_na()
set.seed(97)
model_data_sampled <- model_data %>% sample_frac(0.3)
str(model_data_sampled)
predictor_vars <- model_data_sampled %>%
select(-c(FoodDesert,LILATracts_1And10,LILATracts_1And20,LILATracts_halfAnd10,`2010_Census_Population`)) %>%
select(where(is.numeric))
y <- model_data_sampled$FoodDesert
univariate_results <- lapply(names(predictor_vars), function(var) {
temp_formula <- as.formula(paste("FoodDesert ~", var))
model <- glm(temp_formula, data = model_data_sampled, family = binomial())
tidy(model) %>%
filter(term != "(Intercept)") %>%
mutate(variable = var)
})
univariate_df <- do.call(rbind, univariate_results)
selected_vars_univariate <- univariate_df %>%
filter(p.value < 0.05) %>%
pull(variable)
cat("Variables selected after univariate screening:", length(selected_vars_univariate), "\n")
print(selected_vars_univariate)
X_lasso <- model.matrix(
as.formula(paste("~", paste0("`", selected_vars_univariate, "`", collapse = " + "))),
data = model_data_sampled
)[, -1] # remove intercept
y_lasso <- model_data_sampled$FoodDesert
# Lasso logistic regression
set.seed(72)
lasso_fit <- cv.glmnet(X_lasso, y_lasso, family = "binomial", alpha = 1)
best_lambda <- lasso_fit$lambda.min
lasso_coefs <- coef(lasso_fit, s = best_lambda)
selected_lasso_vars <- rownames(lasso_coefs)[lasso_coefs[, 1] != 0]
selected_lasso_vars <- selected_lasso_vars[selected_lasso_vars != "(Intercept)"]
cat("Variables selected after Lasso:", length(selected_lasso_vars), "\n")
print(selected_lasso_vars)
remove_high_vif_iteratively <- function(data, target = "FoodDesert", threshold = 5) {
current_vars <- setdiff(names(data), target)
repeat {
temp_formula <- as.formula(paste(target, "~", paste0("`", current_vars, "`", collapse = " + ")))
temp_model <- glm(temp_formula, data = data, family = binomial())
vif_values <- vif(temp_model)
if (max(vif_values, na.rm = TRUE) < threshold) {
break
}
worst_var <- names(which.max(vif_values))
cat("Removing variable:", worst_var, "VIF =", round(max(vif_values), 2), "\n")
current_vars <- setdiff(current_vars, worst_var)
}
return(current_vars)
}
model_data_reduced <- model_data_sampled %>%
select(all_of(c(selected_lasso_vars, "FoodDesert"))) %>%
drop_na()
# Run VIF-based iterative filtering
final_selected_vars <- remove_high_vif_iteratively(model_data_reduced, target = "FoodDesert", threshold = 5)
cat("Final number of predictors:", length(final_selected_vars), "\n")
print(final_selected_vars)
remove_high_pvalue_iteratively <- function(data, target = "FoodDesert", threshold = 0.05) {
current_vars <- setdiff(names(data), target)
repeat {
temp_formula <- as.formula(paste(target, "~", paste0("`", current_vars, "`", collapse = " + ")))
temp_model <- glm(temp_formula, data = data, family = binomial())
model_summary <- tidy(temp_model)
# Remove intercept row
model_summary <- model_summary %>%
filter(term != "(Intercept)")
# Check max p-value
max_pval <- max(model_summary$p.value, na.rm = TRUE)
if (max_pval < threshold) {
break
}
worst_var <- model_summary %>%
filter(p.value == max_pval) %>%
pull(term) %>%
gsub("`", "", .) # remove backticks
cat("Removing variable:", worst_var, "p-value =", round(max_pval, 4), "\n")
current_vars <- setdiff(current_vars, worst_var)
}
return(current_vars)
}
model_data_reduced <- model_data_sampled %>%
select(all_of(c(final_selected_vars, "FoodDesert"))) %>%
drop_na()
final_significant_vars <- remove_high_pvalue_iteratively(model_data_reduced, target = "FoodDesert", threshold = 0.01)
cat("Final number of predictors after p-value filtering:", length(final_significant_vars), "\n")
print(final_significant_vars)
final_formula <- as.formula(paste("FoodDesert ~", paste0("`", final_significant_vars, "`", collapse = " + ")))
final_model <- glm(final_formula, data = model_data_reduced, family = binomial())
##
## Call:
## glm(formula = final_formula, family = binomial(), data = model_data_reduced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.4968 0.0314 -79.56 < 2e-16 ***
## NUMGQTRS 0.1605 0.0196 8.17 3.2e-16 ***
## PovertyRate 0.4851 0.0289 16.79 < 2e-16 ***
## TractKids 0.2417 0.0379 6.38 1.8e-10 ***
## TractSeniors 0.2166 0.0308 7.03 2.1e-12 ***
## TractWhite -0.5274 0.0409 -12.91 < 2e-16 ***
## TractAsian -0.3717 0.0528 -7.04 1.9e-12 ***
## TractAIAN 0.0965 0.0296 3.26 0.00113 **
## TractSNAP 0.3322 0.0301 11.03 < 2e-16 ***
## PCT_NHNA10 -0.1189 0.0312 -3.81 0.00014 ***
## POVRATE15 -0.2068 0.0312 -6.63 3.4e-11 ***
## CH_FOODINSEC_14_17 -0.1470 0.0326 -4.51 6.4e-06 ***
## PCT_OBESE_ADULTS12 -0.0951 0.0270 -3.52 0.00043 ***
## GROCPTH11 -0.1915 0.0374 -5.12 3.1e-07 ***
## SUPERCPTH16 0.1005 0.0224 4.48 7.3e-06 ***
## CONVSPTH16 0.0886 0.0291 3.04 0.00234 **
## SPECSPTH11 -0.0944 0.0339 -2.78 0.00541 **
## FSRPTH11 0.0764 0.0292 2.62 0.00886 **
## PCT_LOCLFARM07 -0.1869 0.0522 -3.58 0.00035 ***
## PCT_LOCLSALE12 -0.0917 0.0338 -2.71 0.00674 **
## PC_DIRSALES07 0.0726 0.0222 3.27 0.00106 **
## FMRKT13 -0.1886 0.0441 -4.28 1.9e-05 ***
## PCT_FMRKT_SNAP18 0.0788 0.0274 2.88 0.00400 **
## PCT_FMRKT_CREDIT18 -0.0816 0.0264 -3.09 0.00199 **
## PCT_LACCESS_LOWI10 0.2188 0.0346 6.32 2.6e-10 ***
## PCH_LACCESS_HHNV_10_15 0.3866 0.1112 3.48 0.00051 ***
## PCT_LACCESS_HHNV10 0.1741 0.0313 5.57 2.6e-08 ***
## PCT_LACCESS_SNAP15 0.1822 0.0367 4.97 6.7e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16498 on 21758 degrees of freedom
## Residual deviance: 13145 on 21731 degrees of freedom
## AIC: 13201
##
## Number of Fisher Scoring iterations: 6
## [1] "VIF values of the coefficients:"
## PC_DIRSALES07 NUMGQTRS TractAsian
## 1.16 1.20 1.20
## SUPERCPTH16 PCH_LACCESS_HHNV_10_15 CH_FOODINSEC_14_17
## 1.21 1.23 1.27
## PCT_LOCLSALE12 PCT_OBESE_ADULTS12 FMRKT13
## 1.30 1.40 1.52
## GROCPTH11 SPECSPTH11 PCT_LOCLFARM07
## 1.52 1.53 1.55
## FSRPTH11 PCT_FMRKT_SNAP18 PCT_FMRKT_CREDIT18
## 1.56 1.69 1.73
## CONVSPTH16 TractSeniors POVRATE15
## 1.81 1.86 1.90
## PCT_LACCESS_HHNV10 PovertyRate TractAIAN
## 1.94 2.03 2.18
## PCT_NHNA10 TractSNAP TractKids
## 2.26 2.34 2.77
## PCT_LACCESS_LOWI10 TractWhite PCT_LACCESS_SNAP15
## 2.85 3.09 3.16
final_vars_with_target <- c(final_significant_vars, "FoodDesert")
final_data <- model_data_reduced %>%
select(all_of(final_vars_with_target))
final_data_numeric <- final_data %>%
mutate(across(everything(), ~ as.numeric(as.character(.))))
cor_matrix_final <- cor(final_data_numeric, use = "complete.obs", method = "spearman")
cor_with_target <- data.frame(
Variable = rownames(cor_matrix_final),
Correlation = cor_matrix_final[, "FoodDesert"]
)
cor_with_target <- cor_with_target %>%
filter(Variable != "FoodDesert")
cor_with_target <- cor_with_target %>%
mutate(abs_correlation = abs(Correlation))
top10_features <- cor_with_target %>%
arrange(desc(abs_correlation)) %>%
slice(1:5)
top10_var_names <- top10_features$Variable
cor_matrix_top10 <- cor_matrix_final[c(top10_var_names,"FoodDesert"), c(top10_var_names,"FoodDesert")]
corrplot(
cor_matrix_top10,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45,
number.cex = 0.8,
tl.cex = 0.9,
mar = c(0,0,2,0)
)
boxplot_data <- merged_df %>%
mutate(FoodDesert = as.factor(FoodDesert)) %>%
select(c(top10_var_names,"FoodDesert"))
for (var in top10_var_names) {
p <- ggplot(boxplot_data, aes(x = FoodDesert, y = .data[[var]])) +
geom_boxplot(fill = "green") +
labs(
title = paste("Boxplot of", var, "by Food Desert Status"),
x = "Food Desert (0 = No, 1 = Yes)",
y = var
) +
theme_minimal(base_size = 14)
print(p)
}
It looks like PovertyRate,Vehicle access and SNAP status are the most prominent factors